\documentclass{article}
\usepackage{enumerate}
\usepackage{times}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{amsfonts}
\newcommand{\argmax}{\operatornamewithlimits{argmax}}
\newcommand{\argmin}{\operatornamewithlimits{argmin}}
\newcommand{\myre}{\nonumber\\}
\newcommand{\mysp}{\,\,\,}
\newcommand{\pexp}[1]{\mathrm{E}\left[#1 \right]}
\newcommand{\pvar}[1]{\mathrm{Var}\left[#1 \right]}
\newcommand{\md}[1]{\mathrm{d}\,#1}
\newcommand{\mpd}[1]{\partial\,#1}
\newcommand{\mfracd}[2]{\frac{\md{#1}}{\md{#2}}}
\newcommand{\mfracpd}[2]{\frac{\mpd{#1}}{\mpd{#2}}}
\newcommand{\mvec}[1]{{#1}_1,\ldots,{#1}_n}
\newtheorem{mthm}{Theorem}
\newtheorem{mlem}{Lemma}
\newtheorem{mrem}{Remark}
\newtheorem{mdef}{Definition}
\newtheorem{mcor}{Corollary}

\usepackage[normalbib]{savetrees}


\title{Theorems database}
\author{Han Xiao}
\date{\today}

\begin{document}
\maketitle


\begin{mdef}[1]
  The set, $S$, of all possible outcomes of a particular experiment is called
  the sample space for the experiment.
\end{mdef}

\begin{mdef}[2]
  An event is any collection of possible outcomes of an experiment, that is, any
  subset of $S$ (including $S$ itself).
\end{mdef}


\begin{mthm}[3]
  For any three events, $A$, $B$ and $C$, defined on a sample space $S$,
    \begin{enumerate}
      \item Commutativity
        
        \begin{eqnarray}
        A \cap B= B \cap A,\myre
        A \cup B= B\cup A;\nonumber    
        \end{eqnarray}
      \item Associativity
        \begin{eqnarray}
        A\cup (B\cup C)=(A\cup B)\cup C,\myre
        A\cap (B\cap C)=(A\cap B)\cap C;\nonumber
        \end{eqnarray}
      \item Distributive Laws
        \begin{eqnarray}
        A\cap (B\cup C)=(A\cap B)\cup (A\cap C),\myre
        A\cup (B\cap C)=(A\cup B)\cap (A\cup C).\nonumber
        \end{eqnarray}         
    \end{enumerate}
\end{mthm}

\begin{mdef}[5]
Two events $A$ and $B$ are disjoint (or mutually exclusive) if
$A\cap B=\emptyset$. The events $A_{1},A_{2},\ldots$ are pairwise disjoint (or
mutually exclusive) if $A_{i}\cap A_{j}=\emptyset$ for all $i\neq j$.
\end{mdef}

\begin{mdef}[5]
If $A_{1},A_{2},\ldots$ are pairwise disjoint and $\cup_{i=1}^{\infty}A_{i}=S$,
then the collection $A_{1},A_{2},\ldots$ forms a partition of $S$.
\end{mdef}


\begin{mdef}[6]
A collection of subsets of $S$ is called a \emph{sigma algebra} (or Borel
field), denoted by $\mathcal{B}$, if it satisfies the following three
properties:
    \begin{enumerate}
      \item $\emptyset \in \mathcal{B}$ (the empty set is an element of $\mathcal{B}$).
      \item If $A\in \mathcal{B}$, then $A^{C}\in \mathcal{B}$ ($\mathcal{B}$ is
        closed under complementation).
      \item If $A_{1},A_{2},\ldots \in \mathcal{B}$, then
        $\cup_{i=1}^{\infty}A_{i}\in \mathcal{B}$ ($\mathcal{B}$ is closed under
        countable unions).
    \end{enumerate}
\end{mdef}

\begin{mdef}[7]
Given a sample space $S$ and an associated sigma algebra $\mathcal{B}$, a
\emph{probability function} is a function $P$ with domain $\mathcal{B}$ that
satisfies
    \begin{enumerate}
      \item $P(A)\geq 0$ for all $A\in \mathcal{B}$.
      \item $P(S)=1$.
      \item If $A_{1},A_{2},\ldots \in \mathcal{B}$ are pairwise disjoint, then 
        $P(\cup_{i=1}^{\infty}A_{i})=\sum_{i=1}^{\infty}P(A_{i})$.
    \end{enumerate}
\end{mdef}

\begin{mthm}[7]
Let $S=\{s_{1},\ldots,s_{n}\}$ be a finite set. Let $\mathcal{B}$ be any sigma
algebra of subsets of $S$. Let $p_{1},\ldots,p_{n}$ be nonnegative numbers that
sum to $1$. For any $A\in \mathcal{B}$, define $P(A)$ by:
\[
P(A)=\sum_{\{i:s_{i}\in A\}}p_{i}.
\]
(The sum over an empty set is defined to be $0$.) Then $P$ is a probability
function on $\mathcal{B}$. This remains true if $S=\{s_{1},s_{2},\ldots\}$ is a
countably infinite set.
\end{mthm}

\begin{mthm}[10]
If $P$ is a probability function and $A$ is any set in $\mathcal{B}$, then
    \begin{enumerate}
      \item $P(\emptyset)=0$;
      \item $P(A)\leq1$;
      \item $P(A^{C})=1-P(A)$.
    \end{enumerate}
\end{mthm}

\begin{mthm}[10]
If $P$ is a probability function and $A$ and $B$ are any sets in $\mathcal{B}$, then
    \begin{enumerate}
      \item $P(B\cap A^{C})=P(B)-P(A\cap B)$;
      \item $P(A\cup B)=P(A)+P(B)-P(A\cap B)$;
      \item If $A\subset B$, then $P(A)\leq P(B)$.
    \end{enumerate}
\end{mthm}

\begin{mthm}[11]
If $P$ is a probability function, then
    \begin{enumerate}
      \item For any set $A\in\mathcal{B}$ and any partition $C_{1},C_{2},\ldots$
        of $S$, \[P(A)=\sum_{i=1}^{\infty}P(A\cap C_{i});\]
      \item For any sets $A_{1},A_{2},\ldots$ (Boole's
        Inequality), \[P(\cup_{i=1}^{\infty}A_{i})\leq
        \sum_{i=1}^{\infty}P(A_{i});\]
      \item For any sets $A_{1},\ldots,A_{n}$ (Bonferroni
        Inequality), \[P(\cap_{i=1}^{n}A_{i})\geq\sum_{i=1}^{n}P(A_{i})-(n-1).\]
    \end{enumerate}
\end{mthm}
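
For example, taking $n=2$ in the Bonferroni Inequality gives the frequently
used bound
\[
P(A\cap B)\geq P(A)+P(B)-1.
\]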

\begin{mthm}[13]
  If a job consists of $k$ separate tasks, the $i$th of which can be done in
  $n_{i}$ ways, $i=1,\ldots,k$, then the entire job can be done in $n_{1}\times
  n_{2}\times\cdots\times n_{k}$ ways.
\end{mthm}

\begin{mrem}[16]
  Ball picking problem: Table~\ref{tbl:ballpk} counts the number of possible
  arrangements of size $r$ chosen from $n$ objects.

  \begin{table}[!htb]
      \centering
  \begin{tabular}{l|cc}
    \hline
  Type  &   Without replacement & With replacement\\
  \hline
 Ordered   & $\frac{n!}{(n-r)!}$           &   $n^{r}$\\
 Unordered & $\binom{n}{r}$             &   $\binom{n+r-1}{r}$\\
 \hline
\end{tabular}
\caption{Number of possible arrangements of size $r$ from $n$ objects.\label{tbl:ballpk}}
\end{table}
\end{mrem}
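
For example, with $n=3$ objects and $r=2$ draws, Table~\ref{tbl:ballpk} gives
$3!/1!=6$ ordered samples without replacement, $3^{2}=9$ ordered samples with
replacement, $\binom{3}{2}=3$ unordered samples without replacement, and
$\binom{4}{2}=6$ unordered samples with replacement.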

\begin{mthm}[25]
If $A$ and $B$ are independent events, then the following pairs are also
independent:
    \begin{enumerate}
      \item $A$ and $B^{C}$,
      \item $A^{C}$ and $B$,
      \item $A^{C}$ and $B^{C}$.
    \end{enumerate}
\end{mthm}

\begin{mdef}[27]
  A \emph{random variable} is a function from a sample space $S$ into the real
  numbers.
\end{mdef}

\begin{mthm}[31]
The function $F(x)$ is a cdf if and only if the following three conditions hold:
    \begin{enumerate}
      \item $\lim_{x\rightarrow -\infty}F(x)=0$ and $\lim_{x\rightarrow \infty}F(x)=1$.
      \item $F(x)$ is a nondecreasing function of $x$.
      \item $F(x)$ is right-continuous; that is, for every number $x_{0}$,
        $\lim_{x\downarrow x_{0}}F(x)=F(x_{0})$.
    \end{enumerate}
\end{mthm}

\begin{mdef}[33]
The random variables $X$ and $Y$ are identically distributed if, for every set
$A\in\mathcal{B}$, $P(X\in A)=P(Y\in A)$.
\end{mdef}

\begin{mthm}[34]
The following two statements are equivalent:
    \begin{enumerate}
      \item The random variables $X$ and $Y$ are identically distributed.
      \item $F_{X}(x)=F_{Y}(x)$ for every $x$.
    \end{enumerate}
\end{mthm}

\begin{mdef}[34]
The \emph{probability mass function} (pmf) of a discrete random variable $X$ is
given by:
\[
f_{X}(x)=P(X=x)\,\,\,\mathrm{for}\,\,\mathrm{all}\,\,\,x.
\]
\end{mdef}

\begin{mdef}[35]
The \emph{probability density function or pdf}, $f_{X}(x)$, of a continuous
random variable $X$ is the function that satisfies:
\[
F_{X}(x)=\int_{-\infty}^{x}f_{X}(t)\mathrm{d}t\,\,\,\mathrm{for}\,\,\mathrm{all}\,\,\,x.
\]
\end{mdef}

\begin{mrem}[36]
The expression ``$X$ has a distribution given by $F_{X}(x)$'' is abbreviated
symbolically by $X\sim F_{X}(x)$, where we read the symbol $\sim$ as ``is
distributed as.'' We use an uppercase letter for the cdf and the corresponding
lowercase letter for the pmf or pdf.

We can similarly write $X\sim f_{X}(x)$ or, if $X$ and $Y$ have the same
distribution, $X\sim Y$.
\end{mrem}

\begin{mthm}[36]
A function $f_{X}(x)$ is a pdf (or pmf) of a random variable $X$ if and only if:
    \begin{enumerate}
      \item $f_{X}(x)\geq 0$ for all $x$.
      \item $\sum_{x}f_{X}(x)=1$ (pmf) or
        $\int_{-\infty}^{\infty}f_{X}(x)\mathrm{d}x=1$ (pdf).
    \end{enumerate}
\end{mthm}


\begin{mrem}[37]
From a purely mathematical viewpoint, any nonnegative function with a finite
positive integral (or sum) can be turned into a pdf or pmf. For example, if
$h(x)$ is any nonnegative function that is positive on a set $A$, 0 elsewhere,
and:
\[
\int_{\{x\in A\}}h(x)\mathrm{d}x=K<\infty
\]
for some constant $K>0$, then the function $f_{X}(x)=h(x)/K$ is a pdf of a
random variable $X$ taking values in $A$.
\end{mrem}

\begin{mthm}[51]
  Let $X$ have cdf $F_{X}(x)$, let $Y=g(X)$, and define the sets
  \[
  \mathcal{X}=\{x:f_{X}(x)>0\}\textrm{ and }\mathcal{Y}=\{y:y=g(x)\textrm{ for some }x\in \mathcal{X}\}.
  \]
  \textbf{a.} If $g$ is an increasing function on $\mathcal{X}$ (e.g., the first derivative
  $>0$ on $\mathcal{X}$), then $F_{Y}(y)=F_{X}(g^{-1}(y))$ for $y\in \mathcal{Y}$.\\
  \textbf{b.} If $g$ is a decreasing function on $\mathcal{X}$ and $X$ is a continuous
  random variable, then $F_{Y}(y)=1-F_{X}(g^{-1}(y))$ for $y \in \mathcal{Y}$.
\end{mthm}

\begin{mthm}[51]
  Let $X$ have pdf $f_{X}(x)$ and let $Y=g(X)$, where $g$ is a \textbf{monotone} function.
  Define the sets
  \[
  \mathcal{X}=\{x:f_{X}(x)>0\}\textrm{ and }\mathcal{Y}=\{y:y=g(x)\textrm{ for some }x\in \mathcal{X}\}.
  \]
  Suppose that $f_{X}(x)$ is continuous on $\mathcal{X}$ and that $g^{-1}(y)$ has a
  continuous derivative on $\mathcal{Y}$. Then the pdf of $Y$ is given by:
  
  \begin{eqnarray}
      f_{Y}(y)=\left\{ \begin{matrix}
      f_{X}(g^{-1}(y))\left|\frac{\mathrm{d}}{\mathrm{d}y}g^{-1}(y)\right| & y\in \mathcal{Y},\\
      0 & \mathrm{otherwise}. \end{matrix}\right.\nonumber
  \end{eqnarray}


  Note further that in many applications the function $g$ is neither increasing
  nor decreasing, so the result above does not apply directly. However, it is often
  the case that \textbf{$g$ is monotone over certain intervals}, and that
  allows us to obtain the pdf of $Y=g(X)$ piecewise, as in the next theorem.


\end{mthm}
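
As a worked example of this theorem, let $X$ have the uniform pdf $f_{X}(x)=1$
on $\mathcal{X}=(0,1)$ and let $Y=g(X)=-\log X$, so that $g$ is decreasing on
$\mathcal{X}$, $\mathcal{Y}=(0,\infty)$, and $g^{-1}(y)=e^{-y}$. Then
\[
f_{Y}(y)=f_{X}(e^{-y})\left|\frac{\mathrm{d}}{\mathrm{d}y}e^{-y}\right|=e^{-y},\mysp y>0,
\]
the exponential pdf with scale $1$.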


\begin{mthm}[53]
  Let $X$ have pdf $f_{X}(x)$ and let $Y=g(X)$. Define the sets
  \[
  \mathcal{X}=\{x:f_{X}(x)>0\}\textrm{ and }\mathcal{Y}=\{y:y=g(x)\textrm{ for some }x\in \mathcal{X}\}.
  \]
  Suppose there exists a partition $A_{0},A_{1},\ldots,A_{k}$ of $\mathcal{X}$ such that $P(X\in
  A_{0})=0$ and $f_{X}(x)$ is continuous on each $A_{i}$. Further, suppose there
  exist functions $g_{1}(x),\ldots,g_{k}(x)$, defined on $A_{1},\ldots,A_{k}$,
  respectively, satisfying:
  
\begin{enumerate}
\item $g(x)=g_{i}(x)$, for $x\in A_{i}$,
  \item $g_{i}(x)$ is monotone on $A_{i}$,
    \item the set $\mathcal{Y}=\{y:y=g_{i}(x)\,\,\,\mathrm{for\,\,some}\,\,x\in
      A_{i}\}$ is the same for each $i=1,\ldots,k$, and
      \item $g_{i}^{-1}(y)$ has a continuous derivative on $\mathcal{Y}$, for each
        $i=1,\ldots,k$.
        
\end{enumerate}

Then,
  \begin{eqnarray}
      f_{Y}(y)=\left\{ \begin{matrix}
      \sum_{i=1}^{k}f_{X}(g_{i}^{-1}(y))\left|\frac{\mathrm{d}}{\mathrm{d}y}g_{i}^{-1}(y)\right| & y\in \mathcal{Y},\\
      0 & \mathrm{otherwise}. \end{matrix}\right.\nonumber
  \end{eqnarray}

\end{mthm}
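
As a worked example, let $Y=X^{2}$ and take $A_{0}=\{0\}$,
$A_{1}=(-\infty,0)$, $A_{2}=(0,\infty)$, $g_{1}^{-1}(y)=-\sqrt{y}$ and
$g_{2}^{-1}(y)=\sqrt{y}$ on $\mathcal{Y}=(0,\infty)$. The theorem then gives
\[
f_{Y}(y)=\frac{1}{2\sqrt{y}}\left(f_{X}(\sqrt{y})+f_{X}(-\sqrt{y})\right),\mysp y>0.
\]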

\begin{mthm}[54]
\textbf{Probability integral transformation}. Let $X$ have continuous cdf
$F_{X}(x)$ and define the random variable $Y$ as $Y=F_{X}(X)$. Then $Y$ is
uniformly distributed on $(0,1)$, that is, $P(Y\leq y)=y, 0<y<1$.
\end{mthm}

\begin{mrem}[55]
  One application of the probability integral transformation is the generation of
  random samples from a particular distribution. If we need to generate an
  observation $X$ from a population with cdf $F_{X}$, we need only generate a
  uniform random number $u$ between $0$ and $1$ and solve for $x$ in the
  equation $F_{X}(x)=u$.
\end{mrem}
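
For example, to generate an observation from the exponential cdf
$F_{X}(x)=1-e^{-x/\beta}$, $x>0$, solve $u=1-e^{-x/\beta}$ to obtain
\[
x=-\beta\log(1-u).
\]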


\begin{mdef}[55]
The \emph{expected value} or mean of a random variable $g(X)$, denoted by
$\pexp{g(X)}$, is

  \begin{eqnarray}
      \pexp{g(X)}=\left\{ \begin{matrix}
      \int_{-\infty}^{\infty}g(x)f_{X}(x)\,\mathrm{d}x & \mathrm{if}\,\,X\,\,
      \mathrm{is\,\,continuous},\\
      \sum_{x\in\mathcal{X}}g(x)f_{X}(x) & \mathrm{if}\,\,X\,\,
      \mathrm{is\,\,discrete}. \end{matrix}\right.\nonumber
  \end{eqnarray}
  provided that the integral or sum exists. If $\pexp{|g(X)|}=\infty$, we say that
  $\pexp{g(X)}$ does not exist.
\end{mdef}

\begin{mthm}[57]
The process of taking expectations is a linear operation. Let $X$ be a random
variable and let $a,b$ and $c$ be constants. Then for any functions $g_{1}(x)$
and $g_{2}(x)$ whose expectations exist,
\begin{enumerate}
  \item $\pexp{ag_{1}(X)+bg_{2}(X)+c}=a\pexp{g_{1}(X)}+b\pexp{g_{2}(X)}+c$.
  \item If $g_{1}(x)\geq 0$ for all $x$, then $\pexp{g_{1}(X)}\geq 0$.
  \item If $g_{1}(x)\geq g_{2}(x)$ for all $x$, then $\pexp{g_{1}(X)}\geq
    \pexp{g_{2}(X)}$.
  \item If $a\leq g_{1}(x)\leq b$ for all $x$, then $a\leq\pexp{g_{1}(X)}\leq
    b$.
\end{enumerate}
\end{mthm}

\begin{mrem}[58]
When evaluating expectations of nonlinear functions of $X$, we can proceed in
one of two ways. From the definition of $\pexp{g(X)}$, we could directly
calculate
\[
\pexp{g(X)}=\int_{-\infty}^{\infty}g(x)f_{X}(x)\mathrm{d}x.
\]
But we could also find the pdf $f_{Y}(y)$ of $Y=g(X)$ and we would have
\[
\pexp{g(X)}=\pexp{Y}=\int_{-\infty}^{\infty}yf_{Y}(y)\mathrm{d}y.
\]
\end{mrem}
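
For example, let $X$ be uniform on $(0,1)$ and $g(x)=x^{2}$. Directly,
$\pexp{X^{2}}=\int_{0}^{1}x^{2}\,\mathrm{d}x=1/3$; alternatively, $Y=X^{2}$
has pdf $f_{Y}(y)=1/(2\sqrt{y})$ on $(0,1)$, so
$\pexp{Y}=\int_{0}^{1}\frac{y}{2\sqrt{y}}\,\mathrm{d}y=1/3$, and the two
routes agree.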

\begin{mdef}[59]
For each integer $n$, the $n$th \emph{moment} of $X$ (or $F_{X}(x)$),
$\mu_{n}'$, is
\[
\mu_{n}'=\pexp{X^{n}}.
\]
The $n$th central moment of $X$, $\mu_{n}$, is
\[
\mu_{n}=\pexp{(X-\mu)^{n}}
\]
where $\mu=\mu_{1}'=\pexp{X}$.
\end{mdef}

\begin{mdef}[59]
The variance of a random variable $X$ is its second central moment,
$\pvar{X}=\pexp{(X-\pexp{X})^{2}}$. The positive square root of $\pvar{X}$ is
the standard deviation of $X$.
\end{mdef}

\begin{mthm}[61]
If $X$ is a random variable with finite variance, then for any constants $a$ and
$b$,
\[
\pvar{aX+b}=a^{2}\pvar{X}.
\]
\end{mthm}

\begin{mdef}[62]
Let $X$ be a random variable with cdf $F_{X}$. The moment generating function
(mgf) of $X$ (or $F_{X}$), denoted by $M_{X}(t)$, is
\[
M_{X}(t)=\pexp{e^{tX}}=\left\{ \begin{matrix}
      \int_{-\infty}^{\infty}e^{tx}f_{X}(x)\,\mathrm{d}x & \mathrm{if}\,\,X\,\,
      \mathrm{is\,\,continuous},\\
      \sum_{x}e^{tx}P(X=x) & \mathrm{if}\,\,X\,\,
      \mathrm{is\,\,discrete}, \end{matrix}\right.
\]
provided that the expectation exists for $t$ in some neighborhood of $0$.
\end{mdef}

\begin{mthm}[62]
If $X$ has mgf $M_{X}(t)$, then
\[
\pexp{X^{n}}=M_{X}^{(n)}(0),
\]
where we define
\[
M_{X}^{(n)}(0)=\frac{\mathrm{d}^{n}}{\mathrm{d}t^{n}}M_{X}(t)\bigg|_{t=0}.
\]
That is, the $n$th moment is equal to the $n$th derivative of $M_{X}(t)$
evaluated at $t=0$.
\end{mthm}
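
For example, the exponential pdf $f_{X}(x)=(1/\beta)e^{-x/\beta}$, $x>0$, has
mgf $M_{X}(t)=(1-\beta t)^{-1}$ for $t<1/\beta$, so
\[
\pexp{X}=M_{X}^{(1)}(0)=\beta,\mysp \pexp{X^{2}}=M_{X}^{(2)}(0)=2\beta^{2},\mysp \pvar{X}=\beta^{2}.
\]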

\begin{mdef}[63]
The \emph{kernel} of a function is the main part of the function, the part that
remains when constants are disregarded.
\end{mdef}

\begin{mthm}[65]
The set of moments is not enough to determine a distribution
uniquely, because there may be two distinct random variables having the same
moments. If the random variables have \emph{bounded support}, this problem
does not arise.

Let $F_{X}(x)$ and $F_{Y}(y)$ be two cdfs all of whose moments exist.
\begin{enumerate}
\item If $X$ and $Y$ have bounded support, then $F_{X}(u)=F_{Y}(u)$ for all $u$
  if and only if $\pexp{X^{r}}=\pexp{Y^{r}}$ for all integers $r=0,1,2,\ldots$.
\item If the moment generating functions exist and $M_{X}(t)=M_{Y}(t)$ for all
  $t$ in some neighborhood of $0$, then $F_{X}(u)=F_{Y}(u)$ for all $u$.
\end{enumerate}
\end{mthm}


\begin{mthm}[66]
Suppose $\{X_{i},i=1,2,\ldots\}$ is a sequence of random variables, each with
mgf $M_{X_{i}}(t)$. Furthermore, suppose that
\[
\lim_{i\rightarrow\infty}M_{X_{i}}(t)=M_{X}(t),\,\,\,\mbox{for all $t$ in a
  neighborhood of } 0,
\]
and $M_{X}(t)$ is an mgf. Then there is a unique cdf $F_{X}$ whose moments are
determined by $M_{X}(t)$ and, for all $x$ where $F_{X}(x)$ is continuous, we
have
\[
\lim_{i\rightarrow\infty}F_{X_{i}}(x)=F_{X}(x).
\]
That is, convergence, for $|t|<h$, of mgfs to an mgf implies convergence of cdfs.
\end{mthm}


\begin{mlem}[67]
Let $a_{1},a_{2},\ldots$ be a sequence of numbers converging to $a$, that is,
$\lim_{n\rightarrow\infty}a_{n}=a$. Then,
\[
\lim_{n\rightarrow\infty}\left(1+\frac{a_{n}}{n}\right)^{n}=e^{a}.
\]
\end{mlem}
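
These two results combine in the classical Poisson approximation: if
$X_{n}\sim \mathrm{binomial}(n,\lambda/n)$, then
\[
M_{X_{n}}(t)=\left(1+\frac{\lambda(e^{t}-1)}{n}\right)^{n}\rightarrow
e^{\lambda(e^{t}-1)},
\]
the mgf of a $\mathrm{Poisson}(\lambda)$ random variable, so
$F_{X_{n}}(x)\rightarrow F_{X}(x)$ at all continuity points of $F_{X}$.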

\begin{mthm}[67]
For any constants $a$ and $b$, the mgf of the random variable $aX+b$ is given by
\[
M_{aX+b}(t)=e^{bt}M_{X}(at).
\]
\end{mthm}

\begin{mthm}[69]
\textbf{Leibnitz's Rule}. If $f(x,\theta),a(\theta)$ and $b(\theta)$ are
differentiable with respect to $\theta$, then
\[
\frac{\mathrm{d}}{\mathrm{d}\theta}\int_{a(\theta)}^{b(\theta)}f(x,\theta)\,\mathrm{d}x=f(b(\theta),\theta)\frac{\mathrm{d}}{\mathrm{d}\theta}b(\theta)-f(a(\theta),\theta)\frac{\mathrm{d}}{\mathrm{d}\theta}a(\theta)+\int_{a(\theta)}^{b(\theta)}\frac{\partial}{\partial\theta}f(x,\theta)\mathrm{d}x.
\]
\end{mthm}
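
As a quick check of the rule, take $f(x,\theta)=\theta x$, $a(\theta)=0$ and
$b(\theta)=\theta$. Directly,
$\mfracd{}{\theta}\int_{0}^{\theta}\theta x\md{x}=\mfracd{}{\theta}(\theta^{3}/2)=3\theta^{2}/2$,
while the rule gives
$\theta^{2}\cdot 1-0+\int_{0}^{\theta}x\md{x}=\theta^{2}+\theta^{2}/2=3\theta^{2}/2$.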
\begin{mthm}[69]
Suppose the function $h(x,y)$ is continuous at $y_{0}$ for each $x$, and there
exists a function $g(x)$ satisfying
\begin{enumerate}
\item $|h(x,y)|\leq g(x)$ for all $x$ and $y$,
\item $\int_{-\infty}^{\infty}g(x)\md{x}<\infty$.
\end{enumerate}
Then,
\[
\lim_{y\rightarrow y_{0}}\int_{-\infty}^{\infty}h(x,y)\md{x}=\int_{-\infty}^{\infty}h(x,y_{0})\md{x}.
\]

\end{mthm}

\begin{mthm}[70]
Suppose that, for every $x$, $f(x,\theta)$ is differentiable in $\theta$ at
$\theta=\theta_{0}$, and there exists a function $g(x,\theta_{0})$ and a constant $\delta_{0}>0$
such that
\begin{enumerate}
\item $\left|\frac{f(x,\theta_{0}+\delta)-f(x,\theta_{0})}{\delta}\right|\leq g(x,\theta_{0})$ for all $x$ and $|\delta|\leq\delta_{0}$,
\item $\int_{-\infty}^{\infty}g(x,\theta_{0})\md{x}<\infty$.
\end{enumerate}
Then,
\[
\frac{\md{}}{\md{\theta}}\int_{-\infty}^{\infty}f(x,\theta)\md{x}\Big|_{\theta=\theta_{0}}=\int_{-\infty}^{\infty}\left[\frac{\partial}{\partial\theta}f(x,\theta)\Big|_{\theta=\theta_{0}}\right]\md{x}.
\]
\end{mthm}

\begin{mthm}[74]
Suppose that the series $\sum_{x=0}^{\infty}h(\theta,x)$ converges for all
$\theta$ in an interval $(a,b)$ of real numbers and
\begin{enumerate}
\item $\frac{\partial}{\partial{\theta}}h(\theta,x)$ is continuous in $\theta$
  for each $x$,
\item $\sum_{x=0}^{\infty}\frac{\partial}{\partial{\theta}}h(\theta,x)$
  converges uniformly on every closed bounded subinterval of $(a,b)$.
\end{enumerate}
Then,
\[
\frac{\md{}}{\md{\theta}}\sum_{x=0}^{\infty}h(\theta,x)=\sum_{x=0}^{\infty}\frac{\partial}{\partial{\theta}}h(\theta,x).
\]
\end{mthm}

\begin{mthm}[75]
Suppose that the series $\sum_{x=0}^{\infty}h(\theta,x)$ converges uniformly on
$[a,b]$ and that, for each $x$, $h(\theta,x)$ is a continuous function of
$\theta$. Then
\[
\int_{a}^{b}\sum_{x=0}^{\infty}h(\theta,x)\md{\theta}=\sum_{x=0}^{\infty}\int_{a}^{b}h(\theta,x)\md{\theta}.
\]
\end{mthm}

\begin{mthm}[90]
For any real numbers $x$ and $y$ and integer $n>0$,
\[
(x+y)^{n}=\sum_{i=0}^{n}\binom{n}{i}x^{i}y^{n-i}.
\]
\end{mthm}
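
For example, with $x=p$ and $y=1-p$, this shows that the
$\mathrm{binomial}(n,p)$ pmf sums to $1$:
\[
\sum_{i=0}^{n}\binom{n}{i}p^{i}(1-p)^{n-i}=(p+(1-p))^{n}=1.
\]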

\begin{mdef}[111]
A family of pdfs or pmfs is called an \emph{exponential family} if it can be expressed
as
\[
f(x|\mathbf{\theta})=h(x)c(\mathbf{\theta})\exp\left(\sum_{i=1}^{k}w_{i}(\mathbf{\theta})t_{i}(x)\right).
\]
Here $h(x)\geq 0$ and $t_{1}(x),\ldots,t_{k}(x)$ are real-valued functions of the
observation $x$ (they cannot depend on $\mathbf{\theta}$), and
$c(\mathbf{\theta})\geq 0$ and
$w_{1}(\mathbf{\theta}),\ldots,w_{k}(\mathbf{\theta})$ are real-valued functions
of the possibly vector-valued parameter $\mathbf{\theta}$ (they cannot depend
on $x$).
\end{mdef}
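
For example, the $\mathrm{binomial}(n,p)$ family, $0<p<1$, is an exponential
family with $k=1$:
\[
f(x|p)=\binom{n}{x}p^{x}(1-p)^{n-x}=\binom{n}{x}(1-p)^{n}\exp\left(x\log\frac{p}{1-p}\right),
\]
so $h(x)=\binom{n}{x}$ for $x=0,1,\ldots,n$ (and $0$ otherwise),
$c(p)=(1-p)^{n}$, $w_{1}(p)=\log(p/(1-p))$, and $t_{1}(x)=x$.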

\begin{mthm}[112]
If $X$ is a random variable with pdf or pmf of the exponential family, then
\begin{eqnarray*}
  \pexp{\sum_{i=1}^{k}\mfracpd{w_{i}(\mathbf{\theta})}{\theta_{j}}t_{i}(X)}&=&-\mfracpd{}{\theta_{j}}\log
c(\mathbf{\theta});\\
  \pvar{\sum_{i=1}^{k}\mfracpd{w_{i}(\mathbf{\theta})}{\theta_{j}}t_{i}(X)}&=&-\mfracpd{^{2}}{\theta_{j}^{2}}\log
c(\mathbf{\theta})-\pexp{\sum_{i=1}^{k}\mfracpd{^{2}w_{i}(\mathbf{\theta})}{\theta_{j}^{2}}t_{i}(X)}.
\end{eqnarray*}
The advantage of these identities is that we can replace integration or
summation by differentiation, which is often more straightforward.
\end{mthm}
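
For example, in the $\mathrm{binomial}(n,p)$ family above,
$\mfracd{w_{1}(p)}{p}=\frac{1}{p(1-p)}$ and
$-\mfracd{}{p}\log c(p)=\frac{n}{1-p}$, so the first identity gives
\[
\pexp{\frac{X}{p(1-p)}}=\frac{n}{1-p},
\]
that is, $\pexp{X}=np$.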

\begin{mrem}[114]
An exponential family is sometimes reparameterized as
\[
f(x|\eta)=h(x)c^{*}(\eta)\exp\left(\sum_{i=1}^{k}\eta_{i}t_{i}(x)\right).
\]
The set
\[
\mathcal{H}=\left\{\eta=(\eta_{1},\ldots,\eta_{k}):\int_{-\infty}^{\infty}h(x)\exp\left(\sum_{i=1}^{k}\eta_{i}t_{i}(x)\right)\mathrm{d}x<\infty\right\}
\]
is called the \emph{natural parameter space} for the family (a convex set).
For values of $\eta\in \mathcal{H}$, we must have
\[
c^{*}(\eta)=\left[\int_{-\infty}^{\infty}h(x)\exp\left(\sum_{i=1}^{k}\eta_{i}t_{i}(x)\right)\mathrm{d}x\right]^{-1}
\]
to ensure that the pdf integrates to $1$.
\end{mrem}

\begin{mdef}[115]
A \emph{curved exponential family} is a family of densities of the
exponential-family form above for which the dimension of the vector
$\mathbf{\theta}$ is equal to $d<k$. If $d=k$, the family is a
\emph{full exponential family}.
\end{mdef}

\begin{mthm}[116]
  Let $f(x)$ be any pdf and let $\mu$ and $\delta>0$ be any given constants. Then
  the function
  \[
  g(x|\mu,\delta)=\frac{1}{\delta}f\left(\frac{x-\mu}{\delta}\right)
  \]
is a pdf.
\end{mthm}

\begin{mdef}[116]
  Let $f(x)$ be any pdf. Then the family of pdfs $f(x-\mu)$, indexed by the
  parameter $\mu$, $-\infty<\mu<\infty$, is called the \emph{location family
    with standard pdf} $f(x)$ and $\mu$ is called the \emph{location parameter}
  for the family.
\end{mdef}

\begin{mdef}[119]
  Let $f(x)$ be any pdf. Then for any $\delta>0$, the family of pdfs
  $(1/\delta)f(x/\delta)$, indexed by the parameter $\delta$, is called the
  scale family with standard pdf $f(x)$ and $\delta$ is called the \emph{scale
  parameter} of the family.
\end{mdef}

\begin{mdef}[119]
Let $f(x)$ be any pdf. Then for any $\mu$, $-\infty<\mu<\infty$, and any
$\delta>0$, the family of pdfs $(1/\delta)f((x-\mu)/\delta)$, indexed by the
parameter $(\mu,\delta)$, is called the \emph{location-scale} family with
standard pdf $f(x)$; $\mu$ is called the location parameter and $\delta$ is
called the scale parameter.
\end{mdef}

\begin{mthm}[120]
Let $f(\cdot)$ be any pdf. Let $\mu$ be any real number, and let $\delta$ be any
positive real number. Then $X$ is a random variable with pdf
$(1/\delta)f((x-\mu)/\delta)$ if and only if there exists a random variable $Z$
with pdf $f(z)$ and $X=\delta Z+\mu$.
\end{mthm}

\begin{mthm}[121]
  Let $Z$ be a random variable with pdf $f(z)$. Suppose $\pexp{Z}$ and
  $\pvar{Z}$ exist. If $X$ is a random variable with pdf
  $(1/\delta)f((x-\mu)/\delta)$, then
  \[
  \pexp{X}=\delta\pexp{Z}+\mu\mbox{   and   } \pvar{X}=\delta^{2}\pvar{Z}.
  \]
  In particular, if $\pexp{Z}=0$ and $\pvar{Z}=1$, then $\pexp{X}=\mu$ and
  $\pvar{X}=\delta^{2}$.
\end{mthm}


\begin{mthm}[122]
  \textbf{Chebychev's Inequality}. Let $X$ be a random variable and let $g(x)$
  be a nonnegative function. Then for any $r>0$,
\[
P(g(X)\geq r)\leq \frac{\pexp{g(X)}}{r}.
\]
Because Chebychev's Inequality is so widely applicable, it is necessarily
conservative; for specific distributions we can often obtain tighter
bounds.
\end{mthm}
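
For example, if $\mu=\pexp{X}$ and $\sigma^{2}=\pvar{X}<\infty$, taking
$g(x)=(x-\mu)^{2}/\sigma^{2}$ and $r=t^{2}$ yields the familiar form
\[
P(|X-\mu|\geq t\sigma)\leq \frac{1}{t^{2}}.
\]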

\begin{mrem}[123]
When the mgf of $X$ exists, applying Chebychev's Inequality to $g(x)=e^{tx}$
gives, for any constant $a$,
  \[
  P(X\geq a)=P(e^{tX}\geq e^{ta})\leq e^{-at}M_{X}(t)\,\,\,\mathrm{for}\,\,\mathrm{all}\,\,\,t>0.
  \]
\end{mrem}

\begin{mthm}[124]
Let $X_{\alpha,\beta}$ denote a $\mathrm{gamma}(\alpha,\beta)$ random variable with pdf
$f(x|\alpha,\beta)$, where $\alpha>1$. Then for any constants $a$ and $b$,
\[
P(a<X_{\alpha,\beta}<b)=\beta(f(a|\alpha,\beta)-f(b|\alpha,\beta))+P(a<X_{\alpha-1,\beta}<b).
\]
\end{mthm}


\begin{mlem}[124]
\textbf{Stein's Lemma}. Let $X\sim n(\theta,\delta^{2})$, and let $g$ be a
differentiable function satisfying $\pexp{|g'(X)|}<\infty$. Then
\[
\pexp{g(X)(X-\theta)}=\delta^{2}\pexp{g'(X)}.
\]
\end{mlem}
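
For example, taking $g(x)=x$ (so $g'(x)=1$) gives
$\pexp{X(X-\theta)}=\delta^{2}$, that is,
$\pexp{X^{2}}=\delta^{2}+\theta^{2}$.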

\begin{mthm}[125]
Let $\chi_{p}^{2}$ denote a chi squared random variable with $p$ degrees of
freedom. For any function $h(x)$,
\[
\pexp{h(\chi_{p}^{2})}=p \pexp{\left(\frac{h(\chi_{p+2}^{2})}{\chi^{2}_{p+2}}\right)}
\]
provided the expectations exist.
\end{mthm}
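
For example, taking $h(x)=x$ gives
$\pexp{\chi_{p}^{2}}=p\,\pexp{\chi_{p+2}^{2}/\chi_{p+2}^{2}}=p$, recovering
the mean of the chi squared distribution.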

\begin{mthm}[126]
\textbf{Hwang}. Let $g(x)$ be a function with $-\infty<\pexp{g(X)}<\infty$ and
$-\infty<g(-1)<\infty$. Then:
\begin{enumerate}
\item If $X\sim \mathrm{Poisson}(\lambda)$,
  \[\pexp{\lambda g(X)}=\pexp{X g(X-1)};\]
\item If $X\sim \mathrm{negative\,\,binomial}(r,p)$,
  \[\pexp{(1-p)g(X)}=\pexp{\frac{X}{r+X-1}g(X-1)}.\]
\end{enumerate}

\end{mthm}


\begin{mlem}[136]
\textbf{Markov's Inequality}. If $P(Y\geq 0)=1$ and $P(Y=0)<1$, then, for any
$r>0$,
\[
P(Y\geq r)\leq \frac{\pexp{Y}}{r}
\]
with equality if and only if $P(Y=r)=p=1-P(Y=0),0<p\leq 1$.
\end{mlem}

\begin{mthm}[136]
\textbf{Gauss Inequality}. Let $X\sim f$, where $f$ is unimodal with mode $\nu$,
and define $\tau^{2}=\pexp{(X-\nu)^{2}}$. Then
\[
P(|X-\nu|>\varepsilon)\leq\left\{ \begin{matrix}
      \frac{4\tau^{2}}{9\varepsilon^{2}} & \mbox{for all } \varepsilon\geq \sqrt{4/3}\,\tau,\\
      1-\frac{\varepsilon}{\sqrt{3}\,\tau} & \mbox{for all }
      \varepsilon\leq \sqrt{4/3}\,\tau. \end{matrix}\right.
\]
\end{mthm}

\begin{mthm}[137]
\textbf{Vysochanskii-Petunin Inequality}. Let $X\sim f$, where $f$ is unimodal,
and define $\xi^{2}=\pexp{(X-\alpha)^{2}}$ for an arbitrary point $\alpha$. Then
\[
P(|X-\alpha|>\varepsilon)\leq\left\{ \begin{matrix}
      \frac{4\xi^{2}}{9\varepsilon^{2}} & \mbox{for all } \varepsilon\geq \sqrt{8/3}\,\xi,\\
      \frac{4\xi^{2}}{3\varepsilon^{2}}-\frac{1}{3} & \mbox{for all }
      \varepsilon\leq \sqrt{8/3}\,\xi. \end{matrix}\right.
\]
\end{mthm}

\begin{mdef}[139]
An $n$-dimensional random vector is a function from a sample space $S$ into
$\mathcal{R}^{n}$, $n$-dimensional Euclidean space.
\end{mdef}

\begin{mlem}[152]
Let $(X,Y)$ be a bivariate random vector with joint pdf or pmf $f(x,y)$. Then
$X$ and $Y$ are independent random variables if and only if there exist
functions $g(x)$ and $h(y)$ such that, for every $x\in \mathcal{R}$ and $y\in
\mathcal{R}$,
\[
f(x,y)=g(x)h(y).
\]
\end{mlem}

\begin{mthm}[154]
Let $X$ and $Y$ be independent random variables.
\begin{enumerate}
\item For any $A\subset \mathcal{R}$ and $B\subset \mathcal{R}$, $P(X\in A, Y\in
  B)=P(X\in A)P(Y\in B)$; that is, the events $\{X\in A\}$ and $\{Y\in B\}$ are
  independent events.
\item Let $g(x)$ be a function only of $x$ and $h(y)$ be a function only of $y$.
  Then
  \[
  \pexp{g(X)h(Y)}=\pexp{g(X)}\pexp{h(Y)}.
  \]
\end{enumerate}
\end{mthm}

\begin{mthm}[155]
Let $X$ and $Y$ be independent random variables with moment generating functions
$M_{X}(t)$ and $M_{Y}(t)$. Then the moment generating function of the random
variable $Z=X+Y$ is given by
\[
M_{Z}(t)=M_{X}(t)M_{Y}(t).
\]
\end{mthm}

\begin{mthm}[156]
Let $X\sim n(\mu,\sigma^{2})$ and $Y\sim n(\gamma, \tau^{2})$ be independent
normal random variables. Then the random variable $Z=X+Y$ has a
$n(\mu+\gamma,\sigma^{2}+\tau^{2})$ distribution.
\end{mthm}
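
This follows from the preceding theorem: recalling that the $n(\mu,\sigma^{2})$
mgf is $e^{\mu t+\sigma^{2}t^{2}/2}$,
\[
M_{Z}(t)=e^{\mu t+\sigma^{2}t^{2}/2}\,e^{\gamma t+\tau^{2}t^{2}/2}=e^{(\mu+\gamma)t+(\sigma^{2}+\tau^{2})t^{2}/2},
\]
which is the mgf of a $n(\mu+\gamma,\sigma^{2}+\tau^{2})$ random variable.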

\begin{mthm}[158]
If $X\sim \mathrm{Poisson}(\theta)$ and $Y\sim \mathrm{Poisson}(\lambda)$ and $X$ and $Y$ are
independent, then $X+Y\sim \mathrm{Poisson}(\theta+\lambda)$.
\end{mthm}

\begin{mthm}[161]
Let $X$ and $Y$ be independent random variables. Let $g(x)$ be a function only
of $x$ and $h(y)$ be a function only of $y$. Then the random variables $U=g(X)$
and $V=h(Y)$ are independent.
\end{mthm}

\begin{mthm}[164]
If $X$ and $Y$ are any two random variables, then
\[
\pexp{X}=\pexp{\pexp{X|Y}},
\]
provided that the expectations exist.
\end{mthm}

\begin{mdef}[165]
A random variable $X$ is said to have a \emph{mixture distribution} if the distribution
of $X$ depends on a quantity that also has a distribution.
\end{mdef}

\begin{mthm}[167]
\textbf{Conditional variance identity}. For any two random variables $X$ and
$Y$,
\[
\pvar{X}=\pexp{\pvar{X|Y}}+\pvar{\pexp{X|Y}},
\]
provided that the expectations exist.
\end{mthm}
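
For example, in the hierarchy $X|Y\sim \mathrm{binomial}(Y,p)$ with $Y\sim
\mathrm{Poisson}(\lambda)$, combining this identity with
$\pexp{X}=\pexp{\pexp{X|Y}}=\pexp{pY}=p\lambda$ gives
\[
\pvar{X}=\pexp{Yp(1-p)}+\pvar{pY}=\lambda p(1-p)+p^{2}\lambda=\lambda p.
\]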

\begin{mdef}[169]
The covariance of $X$ and $Y$ is the number defined by
\[
\mathrm{Cov}(X,Y)=\pexp{(X-\mu_{X})(Y-\mu_{Y})}.
\]
The correlation of $X$ and $Y$ is the number defined by
\[
\rho_{XY}=\frac{\mathrm{Cov}(X,Y)}{\sigma_{X}\sigma_{Y}}.
\]
The value $\rho_{XY}$ is also called the \emph{correlation coefficient}.
\end{mdef}

\begin{mthm}[170]
For any random variables $X$ and $Y$,
\[
\mathrm{Cov}(X,Y)=\pexp{XY}-\mu_{X}\mu_{Y}.
\]
\end{mthm}


\begin{mthm}[171]
  If $X$ and $Y$ are independent random variables, then $\mathrm{Cov}(X,Y)=0$
  and $\rho_{XY}=0$.
\end{mthm}

\begin{mthm}[171]
If $X$ and $Y$ are any two random variables and $a$ and $b$ are any two
constants, then
\[
\pvar{aX+bY}=a^{2}\pvar{X}+b^{2}\pvar{Y}+2ab\mathrm{Cov}(X,Y).
\]
If $X$ and $Y$ are independent random variables, then
\[
\pvar{aX+bY}=a^{2}\pvar{X}+b^{2}\pvar{Y}.
\]
\end{mthm}

\begin{mthm}[172]
For any random variables $X$ and $Y$,
\begin{enumerate}
\item $-1\leq\rho_{XY}\leq 1$.
  \item $|\rho_{XY}|=1$ if and only if there exist numbers $a\neq 0$ and $b$
    such that $P(Y=aX+b)=1$. If $\rho_{XY}=1$, then $a>0$, and if
    $\rho_{XY}=-1$, then $a<0$.
\end{enumerate}
\end{mthm}

\begin{mdef}[180]
Let $n$ and $m$ be positive integers and let $p_{1},\ldots,p_{n}$ be numbers
satisfying $0\leq p_{i}\leq 1, \, i=1,\ldots,n $ and $\sum_{i=1}^{n}p_{i}=1$.
Then the random vector $(X_{1},\ldots,X_{n})$ has a \emph{multinomial
  distribution} with $m$ trials and cell probabilities $p_{1},\ldots, p_{n}$ if
the joint pmf of $(X_{1},\ldots,X_{n})$ is
\[
f(x_{1},\ldots,x_{n})=\frac{m!}{x_{1}!\cdots
  x_{n}!}\,p_{1}^{x_{1}}\cdots
p_{n}^{x_{n}}=m!\prod_{i=1}^{n}\frac{p_{i}^{x_{i}}}{x_{i}!}
\]
on the set of $(x_{1},\ldots,x_{n})$ such that each $x_{i}$ is a nonnegative
integer and $\sum_{i=1}^{n}x_{i}=m$.
\end{mdef}


\begin{mthm}[181]
Let $m$ and $n$ be positive integers. Let $\mathcal{A}$ be the set of vectors
$\mathbf{x}=(x_{1},\ldots,x_{n})$ such that each $x_{i}$ is a nonnegative
integer and $\sum_{i=1}^{n}x_{i}=m$. Then, for any real numbers
$p_{1},\ldots,p_{n}$,
\[
(p_{1}+\cdots+p_{n})^{m}=\sum_{\mathbf{x}\in\mathcal{A}}\frac{m!}{x_{1}!\cdots
  x_{n}!}\,p_{1}^{x_{1}}\cdots p_{n}^{x_{n}}.
\]
\end{mthm}
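
In particular, since the multinomial cell probabilities satisfy
$p_{1}+\cdots+p_{n}=1$, this theorem shows that the multinomial pmf sums to
$1$:
\[
\sum_{\mathbf{x}\in\mathcal{A}}\frac{m!}{x_{1}!\cdots x_{n}!}\,p_{1}^{x_{1}}\cdots p_{n}^{x_{n}}=(p_{1}+\cdots+p_{n})^{m}=1.
\]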


\begin{mrem}[182]
For a fixed value of $x_{n}\in \{0,1,\ldots,m\}$, to compute the marginal pmf
$f(x_{n})$, we must sum over all possible values of $(x_{1},\ldots,x_{n-1})$.
Carrying out this sum shows that the marginal distribution of $X_{n}$ is
$\mathrm{binomial}(m,p_{n})$.


The conditional pmf of $(X_{1},\ldots, X_{n-1})$ given $X_{n}=x_{n}$ is the pmf
of a multinomial distribution with $m-x_{n}$ trials and cell probabilities
$p_{1}/(1-p_{n}),\ldots,p_{n-1}/(1-p_{n})$. In fact, the conditional
distribution of any subset of the coordinates of $(X_{1},\ldots,X_{n})$ given
the values of the rest of the coordinates is a multinomial distribution.
\end{mrem}


\begin{mrem}[182]
  For a multinomial distribution, the pairwise covariances are negative and are
  given by
  \[
\mathrm{Cov}(X_{i},X_{j})=\pexp{(X_{i}-mp_{i})(X_{j}-mp_{j})}=-mp_{i}p_{j},\mysp i\neq j.
  \]
\end{mrem}


\begin{mdef}[182]
Let $\mvec{\mathbf{X}}$ be random vectors with joint pdf or pmf $f(\mvec{\mathbf{x}})$.
Let $f_{\mathbf{X}_{i}}(\mathbf{x}_{i})$ denote the marginal pdf or pmf of $\mathbf{X}_{i}$. Then
$\mvec{\mathbf{X}}$ are called mutually independent random vectors if, for
every $(\mvec{\mathbf{x}})$,
  \[
  f(\mvec{\mathbf{x}})=f_{\mathbf{X}_{1}}(\mathbf{x}_{1})\cdots
  f_{\mathbf{X}_{n}}(\mathbf{x}_{n})=\prod_{i=1}^{n}f_{\mathbf{X}_{i}}(\mathbf{x}_{i}).
  \]
 If the $\mathbf{X}_{i}$s are all one-dimensional, then $\mvec{\mathbf{X}}$ are
 called mutually independent random variables.
\end{mdef}


\begin{mthm}[183]
Let $\mvec{X}$ be mutually independent random variables. Let $\mvec{g}$ be
real-valued functions such that $g_{i}(x_{i})$ is a function only of $x_{i}$,
$i=1,\ldots,n$. Then
\[
\pexp{g_{1}(X_{1})\cdots g_{n}(X_{n})}=\pexp{g_{1}(X_{1})}\cdots \pexp{g_{n}(X_{n})}.
\]
\end{mthm}

\begin{mthm}[183]
  Let $\mvec{X}$ be mutually independent random variables with mgfs
  $M_{X_{1}}(t),\ldots,M_{X_{n}}(t)$. Let $Z=X_{1}+\cdots+X_{n}$. Then the mgf of
  $Z$ is
  \[
  M_{Z}(t)=M_{X_{1}}(t)\cdots M_{X_{n}}(t).
  \]

  In particular, if $\mvec{X}$ all have the same distribution with mgf
  $M_{X}(t)$, then
  \[
  M_{Z}(t)=(M_{X}(t))^{n}.
  \]
\end{mthm}
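
For example, if $\mvec{X}$ are mutually independent, each exponential with
scale $\beta$ and mgf $M_{X}(t)=(1-\beta t)^{-1}$ for $t<1/\beta$, then
\[
M_{Z}(t)=(1-\beta t)^{-n},
\]
which is the mgf of a $\mathrm{gamma}(n,\beta)$ random variable.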

\begin{mcor}[183]
 Let $\mvec{X}$ be mutually independent random variables with mgfs
  $M_{X_{1}}(t),\ldots,M_{X_{n}}(t)$. Let $\mvec{a}$ and $\mvec{b}$ be fixed
  constants. Let $Z=(a_{1}X_{1}+b_{1})+\cdots+(a_{n}X_{n}+b_{n})$. Then the mgf
  of $Z$ is
  \[
  M_{Z}(t)=e^{t\sum_{i=1}^{n}b_{i}}M_{X_{1}}(a_{1}t)\cdots M_{X_{n}}(a_{n}t).
  \]
\end{mcor}

\begin{mcor}[184]
A linear combination of independent normal random variables is normally
distributed. Let $\mvec{X}$ be mutually independent random variables with
$X_{i}\sim n(\mu_{i},\sigma_{i}^{2})$. Let $\mvec{a}$ and $\mvec{b}$ be fixed
constants. Then
\[
Z=\sum_{i=1}^{n}(a_{i}X_{i}+b_{i})\sim
n\left(\sum_{i=1}^{n}(a_{i}\mu_{i}+b_{i}),\sum_{i=1}^{n}a_{i}^{2}\sigma_{i}^{2}\right).
\]

\end{mcor}


\begin{mthm}[184]
  Let $\mvec{\mathbf{X}}$ be random vectors. Then $\mvec{\mathbf{X}}$ are
  mutually independent random vectors if and only if there exist functions
  $g_{i}(\mathbf{x}_{i})$, $i=1,\ldots,n$, such that the joint pdf or pmf of
  $(\mvec{\mathbf{X}})$ can be written as
  \[
  f(\mvec{\mathbf{x}})=g_{1}(\mathbf{x}_{1})\cdots g_{n}(\mathbf{x}_{n}).
  \]
\end{mthm}

\begin{mthm}[184]
  Let $\mvec{\mathbf{X}}$ be mutually independent random vectors. Let
  $g_{i}(\mathbf{x}_{i})$ be a function only of $\mathbf{x}_{i}$,
  $i=1,\ldots,n$. Then the random variables $U_{i}=g_{i}(\mathbf{X}_{i})$,
  $i=1,\ldots,n$, are mutually independent.
\end{mthm}

\begin{mlem}[185]
  Let $a$ and $b$ be any positive numbers, and let $p$ and $q$ be any positive
  numbers (necessarily greater than $1$) satisfying
  \[
  \frac{1}{p}+\frac{1}{q}=1.
  \]
  Then
  \[
  \frac{1}{p}a^{p}+\frac{1}{q}b^{q}\geq ab
  \]
  with equality if and only if $a^{p}=b^{q}$.
\end{mlem}
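
For example, taking $p=q=2$ gives $ab\leq (a^{2}+b^{2})/2$, with equality if
and only if $a=b$.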

\begin{mthm}[187]
\textbf{H\"{o}lder's Inequality}. Let $X$ and $Y$ be any two random variables and let
$p$ and $q$ satisfy $1/p+1/q=1$. Then
\[
|\pexp{XY}|\leq \pexp{|XY|} \leq \pexp{|X|^{p}}^{1/p}\pexp{|Y|^{q}}^{1/q}.
\]
\end{mthm}


\begin{mthm}[187]
\textbf{Cauchy-Schwarz Inequality}. For any two random variables $X$ and $Y$,
\[
|\pexp{XY}|\leq \pexp{|XY|} \leq \pexp{|X|^{2}}^{1/2}\pexp{|Y|^{2}}^{1/2}.
\]
\end{mthm}
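
For example, applying the inequality to $X-\mu_{X}$ and $Y-\mu_{Y}$ gives
$\left(\mathrm{Cov}(X,Y)\right)^{2}\leq\sigma_{X}^{2}\sigma_{Y}^{2}$, which
yields the earlier bound $|\rho_{XY}|\leq 1$.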

\begin{mthm}[188]
\textbf{Minkowski's Inequality}. Let $X$ and $Y$ be any two random variables.
Then for $1<p<\infty$,
\[
\pexp{|X+Y|^{p}}^{1/p}\leq \pexp{|X|^{p}}^{1/p}+\pexp{|Y|^{p}}^{1/p}.
\]
\end{mthm}

\begin{mdef}[189]
A function $g(x)$ is convex if $g(\lambda x+(1-\lambda)y)\leq\lambda
g(x)+(1-\lambda)g(y)$, for all $x$ and $y$ and all $0<\lambda<1$. The function
$g(x)$ is concave if $-g(x)$ is convex.
\end{mdef}

\begin{mthm}[190]
\textbf{Jensen's Inequality}. For any random variable $X$, if $g(x)$ is a convex function, then
\[
\pexp{g(X)}\geq g(\pexp{X}).
\]
Equality holds if and only if, for every line $a+bx$ that is tangent to $g(x)$ at
$x=\pexp{X}$, $P(g(X)=a+bX)=1$.
\end{mthm}
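
For example, $g(x)=x^{2}$ is convex, so $\pexp{X^{2}}\geq(\pexp{X})^{2}$;
equivalently, $\pvar{X}\geq 0$.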


\begin{mthm}[192]
\textbf{Covariance Inequality}. Let $X$ be any random variable and $g(x)$ and
$h(x)$ any functions such that $\pexp{g(X)},\pexp{h(X)}$, and $\pexp{g(X)h(X)}$
exist.
\begin{enumerate}
\item If $g(x)$ is a nondecreasing function and $h(x)$ is a nonincreasing
  function, then
  \[
  \pexp{g(X)h(X)}\leq\pexp{g(X)}\pexp{h(X)}.
  \]
\item If $g(x)$ and $h(x)$ are either both nondecreasing or both nonincreasing,
  then
  \[
  \pexp{g(X)h(X)}\geq\pexp{g(X)}\pexp{h(X)}.
  \]
\end{enumerate}
\end{mthm}


\end{document}
